#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import lxml.html
import sys
# Python 2 only: force the process-wide default string encoding to UTF-8 so
# that implicit unicode -> str conversions (e.g. writing scraped text to a
# file) do not fall back to ASCII. The reload() is needed because
# setdefaultencoding is not available on the already-imported sys module.
reload(sys)
sys.setdefaultencoding( "utf-8")
# Identifier of the wordbook to scrape; it is appended to `book` to form the
# wordbook URL and is also used as the output file name under words/.
book_id = '3'
# Base URL to which a wordbook id is appended.
book = 'https://www.shanbay.com/wordbook/'


def get_book_unit(book):
    """Return the absolute URLs of the word lists linked from a wordbook page.

    The page at ``book + book_id + '/'`` is downloaded (``book_id`` is the
    module-level wordbook identifier) and every ``href`` found on an ``<a>``
    inside a ``<td class="wordbook-wordlist-name">`` cell is collected.

    :param book: base wordbook URL, ending with a slash.
    :returns: list of URLs, each href prefixed with the site root.
    """
    site_root = 'https://www.shanbay.com'
    page_source = requests.get(book + book_id + '/').text
    tree = lxml.html.fromstring(page_source)
    hrefs = tree.xpath('//td[@class="wordbook-wordlist-name"]/a/@href')
    # The hrefs are concatenated onto the site root to make them absolute.
    return [site_root + href for href in hrefs]


# Number of result pages requested for every word list (pages 1..N).
PAGES_PER_UNIT = 10


def _page_entries(selector):
    """Yield ``(word, meaning)`` pairs from one parsed word-list page.

    Each ``<tr class="row">`` under a ``<tbody>`` is handled on its own, so a
    word is always paired with the meaning from the same table row. Pairing
    two independently extracted flat lists by index would raise IndexError
    or drift out of alignment whenever a meaning cell has zero or several
    text nodes.

    :param selector: lxml element for the parsed page.
    :returns: generator of (word, meaning) tuples; newlines inside the
        meaning are replaced by spaces so each entry fits on one line.
    """
    for row in selector.xpath('//tbody/tr[@class="row"]'):
        word_parts = row.xpath('td[@class="span2"]/strong/text()')
        if not word_parts:
            # Row without a word cell: nothing to record.
            continue
        meaning_parts = row.xpath('td[@class="span10"]/text()')
        meaning = ' '.join(meaning_parts).replace("\n", " ")
        yield ''.join(word_parts), meaning


urls = get_book_unit(book)
# Open the output file once for the whole run; the with-block guarantees it
# is flushed and closed even if a request or parse step raises.
with open('words/' + book_id + '.txt', 'a') as f:
    for each in urls:
        for page in range(1, PAGES_PER_UNIT + 1):
            url = each + '/?page={}'.format(page)
            html = requests.get(url).text
            selector = lxml.html.fromstring(html)
            # A page with no matching rows simply yields nothing.
            for word, meaning in _page_entries(selector):
                f.write(word + ' ' + meaning + "\n")